{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f06f667a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = '2'\n",
    "os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "0c19cc52",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer\n",
    "from peft import LoraConfig, get_peft_model, AutoPeftModelForCausalLM, PeftModel\n",
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5f185437",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained('unsloth/Llama-3.2-3B-Instruct')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "6c75237e",
   "metadata": {},
   "outputs": [],
   "source": [
    "streamer = TextStreamer(tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "5a47df69",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of LlamaForCausalLM were not initialized from the model checkpoint at unsloth/Llama-3.2-3B-Instruct and are newly initialized: ['lm_head.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "ori_model = AutoModelForCausalLM.from_pretrained(\n",
    "    'unsloth/Llama-3.2-3B-Instruct', tie_word_embeddings=False, torch_dtype = torch.bfloat16,\n",
    ").cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "80330668",
   "metadata": {},
   "outputs": [],
   "source": [
    "ori_model.lm_head.weight.data = ori_model.model.embed_tokens.weight.data.clone()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "00736b5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "state_dict = ori_model.state_dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "bb3dc3b9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'lora-embedding-256-llama3.2-3b-small-malaysian-reasoning/checkpoint-2400'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers.trainer_utils import get_last_checkpoint\n",
    "\n",
    "latest = get_last_checkpoint(\"lora-embedding-256-llama3.2-3b-small-malaysian-reasoning\")\n",
    "latest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "00d794b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "from safetensors import safe_open\n",
    "\n",
    "f = safe_open(f\"{latest}/adapter_model.safetensors\", framework=\"pt\", device=0)\n",
    "keys = f.keys()\n",
    "keys = sorted(list(set([k.split('.lora')[0] for k in keys if '.lora' in k])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "88f54db1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 198/198 [00:03<00:00, 59.69it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "for k in tqdm(keys):\n",
    "    k_ori = k.replace('base_model.model.', '') + '.weight'\n",
    "    if 'embed_tokens' in k:\n",
    "        post_A = '.lora_embedding_A'\n",
    "        post_B = '.lora_embedding_B'\n",
    "    else:\n",
    "        post_A = '.lora_A.weight'\n",
    "        post_B = '.lora_B.weight'\n",
    "    A = k + post_A\n",
    "    B = k + post_B\n",
    "    \n",
    "    W = state_dict[k_ori]\n",
    "    if 'embed_tokens' not in k:\n",
    "        W = W.t()\n",
    "        \n",
    "    A = f.get_tensor(A).type(W.dtype)\n",
    "    B = f.get_tensor(B).type(W.dtype)\n",
    "    with torch.no_grad():\n",
    "        W.addmm_(A.t(), B.t(), alpha = 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "6d7d7fed",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/ssd3/llama-flex-attention-multipack/src/transformers/generation/configuration_utils.py:628: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.0` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n",
      "  warnings.warn(\n",
      "/home/husein/ssd3/llama-flex-attention-multipack/src/transformers/generation/configuration_utils.py:633: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.95` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.\n",
      "  warnings.warn(\n",
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
      "The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n",
      "\n",
      "Cutting Knowledge Date: December 2023\n",
      "Today Date: 05 Jan 2025\n",
      "\n",
      "<|eot_id|><|start_header_id|>user<|end_header_id|>\n",
      "\n",
      "ape isu dengan najib<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n",
      "\n",
      "Najib Razak, yang berkhidmat sebagai Perdana Menteri Malaysia dari 2009 hingga 2018, telah menjadi subjek beberapa kontroversi dan skandal di Malaysia. Berikut adalah beberapa\n"
     ]
    }
   ],
   "source": [
    "d = [\n",
    "    {'role': 'user', 'content': 'ape isu dengan najib'}\n",
    "]\n",
    "\n",
    "inputs = tokenizer.apply_chat_template(d, return_tensors = 'pt').to('cuda')\n",
    "generate_kwargs = dict(\n",
    "    input_ids=inputs,\n",
    "    max_new_tokens=50,\n",
    "    top_p=0.95,\n",
    "    top_k=50,\n",
    "    temperature=0.0,\n",
    "    do_sample=False,\n",
    "    repetition_penalty=1.05,\n",
    "    streamer=streamer\n",
    ")\n",
    "generation_output = ori_model.generate(**generate_kwargs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "52538fbd",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/ssd3/llama-flex-attention-multipack/src/transformers/generation/configuration_utils.py:628: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n",
      "  warnings.warn(\n",
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n",
      "\n",
      "Cutting Knowledge Date: December 2023\n",
      "Today Date: 05 Jan 2025\n",
      "\n",
      "<|eot_id|><|start_header_id|>user<|end_header_id|>\n",
      "\n",
      "camne nk code react, reply in jawi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n",
      "\n",
      "اونتوق ممبوات کومڤونين React يڠ موده اونتوق منچيڤتا بوراڠ دڠن تاجوق دان ڤيليهن \"\n"
     ]
    }
   ],
   "source": [
    "d = [\n",
    "    {'role': 'user', 'content': 'camne nk code react, reply in jawi'}\n",
    "]\n",
    "\n",
    "inputs = tokenizer.apply_chat_template(d, return_tensors = 'pt').to('cuda')\n",
    "generate_kwargs = dict(\n",
    "    input_ids=inputs,\n",
    "    max_new_tokens=50,\n",
    "    top_p=0.95,\n",
    "    top_k=50,\n",
    "    temperature=0.9,\n",
    "    do_sample=False,\n",
    "    repetition_penalty=1.1,\n",
    "    streamer=streamer\n",
    ")\n",
    "generation_output = ori_model.generate(**generate_kwargs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "a1686261",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n",
      "\n",
      "Cutting Knowledge Date: December 2023\n",
      "Today Date: 05 Jan 2025\n",
      "\n",
      "<|eot_id|><|start_header_id|>user<|end_header_id|>\n",
      "\n",
      "camne nk code react, reply in sabah<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n",
      "\n",
      "Bah, kunuk sia mau kasi tau macam mana mo bikin komponen React yang boleh pakai untuk tingu senarai barang. Kita guna hook `useState` sama `useEffect` dari\n"
     ]
    }
   ],
   "source": [
    "d = [\n",
    "    {'role': 'user', 'content': 'camne nk code react, reply in sabah'}\n",
    "]\n",
    "\n",
    "inputs = tokenizer.apply_chat_template(d, return_tensors = 'pt').to('cuda')\n",
    "generate_kwargs = dict(\n",
    "    input_ids=inputs,\n",
    "    max_new_tokens=50,\n",
    "    top_p=0.95,\n",
    "    top_k=50,\n",
    "    temperature=0.9,\n",
    "    do_sample=False,\n",
    "    repetition_penalty=1.1,\n",
    "    streamer=streamer\n",
    ")\n",
    "generation_output = ori_model.generate(**generate_kwargs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "5111d20d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ba58556cf6354bb08850e517b5e2c9da",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/mesolitica/Llama-3.2-3B-Malaysian-Reasoning/commit/00085d8a02cd4d86ea042e24677db154d0a4200a', commit_message='Upload tokenizer', commit_description='', oid='00085d8a02cd4d86ea042e24677db154d0a4200a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mesolitica/Llama-3.2-3B-Malaysian-Reasoning', endpoint='https://huggingface.co', repo_type='model', repo_id='mesolitica/Llama-3.2-3B-Malaysian-Reasoning'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.push_to_hub('mesolitica/Llama-3.2-3B-Malaysian-Reasoning')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "b0f3c1ed",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cc69a78097194e398fa3a59de54cf950",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2025-01-05 15:31:52,668] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/bin/ld: cannot find -laio\n",
      "collect2: error: ld returned 1 exit status\n",
      "/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'\n",
      "/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlclose'\n",
      "/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlerror'\n",
      "/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlsym'\n",
      "collect2: error: ld returned 1 exit status\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0dd3490686284f3cb8875f5b293f03c4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model-00002-of-00002.safetensors:   0%|          | 0.00/2.25G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6e4d74fe346d4751bc3ee46845d435a7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6060e53a26954dc4bc9c496ba9a44ca4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/mesolitica/Llama-3.2-3B-Malaysian-Reasoning/commit/270a323bcb5c78f31c1c3fd0569088ca5a43ae28', commit_message='Upload LlamaForCausalLM', commit_description='', oid='270a323bcb5c78f31c1c3fd0569088ca5a43ae28', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mesolitica/Llama-3.2-3B-Malaysian-Reasoning', endpoint='https://huggingface.co', repo_type='model', repo_id='mesolitica/Llama-3.2-3B-Malaysian-Reasoning'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ori_model.push_to_hub('mesolitica/Llama-3.2-3B-Malaysian-Reasoning')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python3.10",
   "language": "python",
   "name": "python3.10"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
